In case of using Google Colab
# from google.colab import drive
# drive.mount("/content/drive")
In case of running in local
import os
import sys
# Taking the path of this .ipynb
os.getcwd()
'c:\\Users\\maria\\OneDrive\\Escritorio\\The_Bridge\\Visual_Studio_Code\\Python\\Prueba_t-cnica_WA\\src'
# Jumping to the previous folder
rooth_path = os.path.dirname(os.getcwd())
rooth_path
'c:\\Users\\maria\\OneDrive\\Escritorio\\The_Bridge\\Visual_Studio_Code\\Python\\Prueba_t-cnica_WA'
# Adding the previous folder to the sys.path so that anyone can run this script
sys.path.append(rooth_path)
sys.path = list(set(sys.path)) # To avoid duplicates in each run
sys.path
['', 'C:\\Program Files\\Python38\\lib\\site-packages\\win32', 'c:\\Users\\maria\\OneDrive\\Escritorio\\The_Bridge\\Visual_Studio_Code\\Python\\Prueba_t-cnica_WA', 'c:\\Users\\maria\\.vscode\\extensions\\ms-toolsai.jupyter-2021.5.745244803\\pythonFiles\\lib\\python', 'C:\\Program Files\\Python38\\lib\\site-packages', 'C:\\Program Files\\Python38\\lib\\site-packages\\win32\\lib', 'C:\\Program Files\\Python38\\DLLs', 'C:\\Program Files\\Python38\\lib', 'C:\\Program Files\\Python38', 'C:\\Program Files\\Python38\\python38.zip', 'C:\\Program Files\\Python38\\lib\\site-packages\\IPython\\extensions', 'c:\\Users\\maria\\.vscode\\extensions\\ms-toolsai.jupyter-2021.5.745244803\\pythonFiles', 'C:\\Users\\maria\\.ipython', 'c:\\Users\\maria\\OneDrive\\Escritorio\\The_Bridge\\Visual_Studio_Code\\Python\\Prueba_t-cnica_WA\\src', 'C:\\Program Files\\Python38\\lib\\site-packages\\Pythonwin']
In case of using Google Colab
# import pandas as pd
# from pandas_profiling import ProfileReport
# pd.options.mode.chained_assignment = None # default='warn'
# from drive.MyDrive.Prueba_Tecnica.Prueba_tcnica_WA.src.utils.mining_data_tb import changing_dates, split_column, check_dupli_nan
# from drive.MyDrive.Prueba_Tecnica.Prueba_tcnica_WA.src.utils.visualization_tb import histogram_figure, linediagram_figure, correlation_matrix, piechart_figure_6labels
In case of running in local
import pandas as pd
from pandas_profiling import ProfileReport
pd.options.mode.chained_assignment = None # default='warn'
from src.utils.mining_data_tb import changing_dates, split_column, check_dupli_nan
from src.utils.visualization_tb import histogram_figure, linediagram_figure, correlation_matrix, piechart_figure_6labels
sellin = pd.read_csv("../data/sellin.csv", sep=",", low_memory=False)
sellout_prov1 = pd.read_csv("../data/sellout-proveedor1.csv", sep=",", low_memory=False)
sellout_prov2 = pd.read_csv("../data/sellout-proveedor2.csv", sep=",", low_memory=False)
maestro_prod = pd.read_csv("../data/maestro_productos_prueba.csv", sep=",")
maestro_client = pd.read_excel("../data/maestro_clientes_prueba.xlsx", engine='openpyxl')
print(sellin.shape)
print(sellout_prov1.shape)
print(sellout_prov2.shape)
print(maestro_prod.shape)
print(maestro_client.shape)
(4188783, 6) (1206365, 5) (4862029, 5) (1242, 4) (2452, 10)
# Sellin: purchase from our client to supliers EX: Danone to Carrefour
sellin.head()
| Sku | Date | Store_Id | Pieces | Price | Margin | |
|---|---|---|---|---|---|---|
| 0 | 4220015 | 02/01/2015 | 100 | 20.0 | 13.01 | 9.78 |
| 1 | 4220015 | 02/01/2015 | 116 | 20.0 | 13.01 | 9.78 |
| 2 | 4220015 | 02/01/2015 | 141 | 80.0 | 13.01 | 9.78 |
| 3 | 4220015 | 02/01/2015 | 146 | 10.0 | 13.01 | 9.78 |
| 4 | 4220015 | 02/01/2015 | 151 | 20.0 | 13.09 | 9.87 |
# Sellout_prov1: purchase from the first suplier to final client EX: Carrefour to final user
sellout_prov1.head()
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 SO Diaria Piezas | 18 | 6.0 | 15.6 |
| 1 | 4220015 | 2017-01-01 SO Diaria Piezas | 24 | 0.0 | 0.0 |
| 2 | 4220015 | 2017-01-01 SO Diaria Piezas | 39 | 0.0 | 0.0 |
| 3 | 4220015 | 2017-01-01 SO Diaria Piezas | 44 | 0.0 | 0.0 |
| 4 | 4220015 | 2017-01-01 SO Diaria Piezas | 46 | 0.0 | 0.0 |
# Sellout_prov2: purchase from the second suplier to final client EX: Mercadona to final user
sellout_prov2.head()
| Sku | Date | Store_Id | Pieces | Price | |
|---|---|---|---|---|---|
| 0 | 4240039 | 2016/01/01 | 1000 | 23.0 | $16.00 |
| 1 | 4240039 | 2016/01/01 | 1001 | 116.0 | $16.00 |
| 2 | 4240039 | 2016/01/01 | 1002 | 146.0 | $16.00 |
| 3 | 4240039 | 2016/01/01 | 1003 | 17.0 | $16.80 |
| 4 | 4240039 | 2016/01/01 | 1004 | 5.0 | $16.70 |
# Type of produts of our client and competitors
maestro_prod.head()
| Sku | Sku_Description | Sku_Category | Sku_Subcategory | |
|---|---|---|---|---|
| 0 | 11058 | Frosted Mango | Sugar-free | Cereal bars |
| 1 | 29207 | Nut Cheerios Mango | Super-flavour | Cereal box |
| 2 | 38379 | MultiGrain Mango | Sugar-free | Cereal bars |
| 3 | 42671 | Crunch Mango | Sugar-free | Cereal bars |
| 4 | 54644 | Corn Mango | Sugar-free | Cereal bars |
# Shops and refer suppliers which sell their products
maestro_client.head()
| NUM TDA | NOMBRE DE TDA | CADENA | FORMATO | MUNICIPIO/CIUDAD | ESTADO | DIVISION | DEPOSITO / DISTRIBUIDOR | ESTRUCTURA | GRADO | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Bloody Sea Green BookShop | proveedor_1 | Shop | Bizarre White Shop | Coalmar | SURESTE | Mogulval Corp. | DISTRIBUIDORES | VI |
| 1 | 4 | Funny Sky Blue GipsyTrade | proveedor_1 | Shop | Bizarre White Shop | Coalmar | SURESTE | Mogulval Corp. | DISTRIBUIDORES | VI |
| 2 | 5 | Rare Green Laundrette | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
| 3 | 6 | Angry Indigo Center | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
| 4 | 7 | Tall Violet Deli | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
# Here we confirm we have only 2 different suppliers
maestro_client.CADENA.unique()
array(['proveedor_1', 'proveedor_2'], dtype=object)
# Here we saw that the column Margin doen't vary depending on the column pieces.
sellin[(sellin["Pieces"] == 0) | (sellin["Pieces"] == 5)]
| Sku | Date | Store_Id | Pieces | Price | Margin | |
|---|---|---|---|---|---|---|
| 22 | 4220015 | 02/01/2015 | 1150 | 5.0 | 12.59 | 9.36 |
| 23 | 4220015 | 02/01/2015 | 1160 | 5.0 | 12.59 | 9.36 |
| 24 | 4220015 | 02/01/2015 | 1161 | 0.0 | 12.59 | 9.36 |
| 32 | 4220015 | 02/01/2015 | 1497 | 5.0 | 12.72 | 9.49 |
| 74 | 4220015 | 02/01/2015 | 3765 | 5.0 | 12.59 | 9.36 |
| ... | ... | ... | ... | ... | ... | ... |
| 4188052 | 4829827 | 17/05/2017 | 2093 | 0.0 | 20.51 | 14.62 |
| 4188207 | 4829827 | 19/05/2017 | 3776 | 0.0 | 20.51 | 14.62 |
| 4188281 | 4829827 | 20/05/2017 | 3834 | 0.0 | 20.51 | 14.62 |
| 4188404 | 4829827 | 23/05/2017 | 3857 | 0.0 | 20.51 | 14.62 |
| 4188601 | 4829827 | 29/05/2017 | 1124 | 0.0 | 20.51 | 14.62 |
70882 rows × 6 columns
Questions about:
# Here I imported a function from src.utils.mining_data_tb to check if there is any duplicate or nan by each DataFrame
check_dupli_nan("sellin", sellin)
check_dupli_nan("sellout_prov1", sellout_prov1)
check_dupli_nan("sellout_prov2", sellout_prov2)
check_dupli_nan("maestro_prod", maestro_prod)
check_dupli_nan("maestro_client", maestro_client)
sellin duplicates ----> True sellin null ----> False -------------------------- sellout_prov1 duplicates ----> False sellout_prov1 null ----> False -------------------------- sellout_prov2 duplicates ----> True sellout_prov2 null ----> False -------------------------- maestro_prod duplicates ----> False maestro_prod null ----> False -------------------------- maestro_client duplicates ----> False maestro_client null ----> False --------------------------
# Here I droped the duplicates and I did again the checkup
sellin = sellin.drop_duplicates()
check_dupli_nan("sellin", sellin)
sellout_prov2 = sellout_prov2.drop_duplicates()
check_dupli_nan("sellout_prov2", sellout_prov2)
sellin duplicates ----> False sellin null ----> False -------------------------- sellout_prov2 duplicates ----> False sellout_prov2 null ----> False --------------------------
# Remembering how the dataframe is
sellin.head()
| Sku | Date | Store_Id | Pieces | Price | Margin | |
|---|---|---|---|---|---|---|
| 0 | 4220015 | 02/01/2015 | 100 | 20.0 | 13.01 | 9.78 |
| 1 | 4220015 | 02/01/2015 | 116 | 20.0 | 13.01 | 9.78 |
| 2 | 4220015 | 02/01/2015 | 141 | 80.0 | 13.01 | 9.78 |
| 3 | 4220015 | 02/01/2015 | 146 | 10.0 | 13.01 | 9.78 |
| 4 | 4220015 | 02/01/2015 | 151 | 20.0 | 13.09 | 9.87 |
# Checking what type of format are the columns
sellin.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4188765 entries, 0 to 4188782 Data columns (total 6 columns): # Column Dtype --- ------ ----- 0 Sku int64 1 Date object 2 Store_Id int64 3 Pieces float64 4 Price float64 5 Margin float64 dtypes: float64(3), int64(2), object(1) memory usage: 223.7+ MB
# Imported the function from src.utils.mining_data_tb to change the date from object to datetime format
changing_dates(df=sellin, name_column="Date")
# Checking the change of the Dtype of Date column was done well
sellin.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4188765 entries, 0 to 4188782 Data columns (total 6 columns): # Column Dtype --- ------ ----- 0 Sku int64 1 Date datetime64[ns] 2 Store_Id int64 3 Pieces float64 4 Price float64 5 Margin float64 dtypes: datetime64[ns](1), float64(3), int64(2) memory usage: 223.7 MB
# A fundamental part is select the product from X Date to Y Date, so it's important to sort the DataFrame by Date
# Here we can see that have been sold products from 2015 to 2017
sellin = sellin.sort_values("Date")
sellin
| Sku | Date | Store_Id | Pieces | Price | Margin | |
|---|---|---|---|---|---|---|
| 1239443 | 4361050 | 2015-01-04 | 18 | 15.12 | 12.99 | 10.67 |
| 3228928 | 4769520 | 2015-01-04 | 3372 | 4.80 | 35.54 | 27.38 |
| 3228927 | 4769520 | 2015-01-04 | 3344 | 7.20 | 35.54 | 27.38 |
| 3228926 | 4769520 | 2015-01-04 | 3323 | 7.20 | 35.54 | 27.38 |
| 3228925 | 4769520 | 2015-01-04 | 3318 | 4.80 | 35.54 | 27.38 |
| ... | ... | ... | ... | ... | ... | ... |
| 774678 | 4250046 | 2017-12-05 | 1086 | 12.00 | 15.17 | 10.66 |
| 774679 | 4250046 | 2017-12-05 | 1108 | 108.00 | 15.17 | 10.66 |
| 774680 | 4250046 | 2017-12-05 | 1110 | 48.00 | 15.17 | 10.66 |
| 774682 | 4250046 | 2017-12-05 | 1150 | 12.00 | 15.17 | 10.66 |
| 2103459 | 4523924 | 2017-12-05 | 1119 | 4.80 | 43.41 | 29.84 |
4188765 rows × 6 columns
# Here rename the column Pieces to Units as in the other csv
sellin = sellin.rename(columns={'Pieces': 'Units'})
# Deleting the units equal to 0 because they don't provide information
sellin = sellin.loc[(sellin["Units"] != 0)]
sellin
| Sku | Date | Store_Id | Units | Price | Margin | |
|---|---|---|---|---|---|---|
| 1239443 | 4361050 | 2015-01-04 | 18 | 15.12 | 12.99 | 10.67 |
| 3228928 | 4769520 | 2015-01-04 | 3372 | 4.80 | 35.54 | 27.38 |
| 3228927 | 4769520 | 2015-01-04 | 3344 | 7.20 | 35.54 | 27.38 |
| 3228926 | 4769520 | 2015-01-04 | 3323 | 7.20 | 35.54 | 27.38 |
| 3228925 | 4769520 | 2015-01-04 | 3318 | 4.80 | 35.54 | 27.38 |
| ... | ... | ... | ... | ... | ... | ... |
| 774678 | 4250046 | 2017-12-05 | 1086 | 12.00 | 15.17 | 10.66 |
| 774679 | 4250046 | 2017-12-05 | 1108 | 108.00 | 15.17 | 10.66 |
| 774680 | 4250046 | 2017-12-05 | 1110 | 48.00 | 15.17 | 10.66 |
| 774682 | 4250046 | 2017-12-05 | 1150 | 12.00 | 15.17 | 10.66 |
| 2103459 | 4523924 | 2017-12-05 | 1119 | 4.80 | 43.41 | 29.84 |
4141818 rows × 6 columns
# Remembering how the dataframe is
sellout_prov1.head()
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 SO Diaria Piezas | 18 | 6.0 | 15.6 |
| 1 | 4220015 | 2017-01-01 SO Diaria Piezas | 24 | 0.0 | 0.0 |
| 2 | 4220015 | 2017-01-01 SO Diaria Piezas | 39 | 0.0 | 0.0 |
| 3 | 4220015 | 2017-01-01 SO Diaria Piezas | 44 | 0.0 | 0.0 |
| 4 | 4220015 | 2017-01-01 SO Diaria Piezas | 46 | 0.0 | 0.0 |
# Checking if SO Diaria Piezas is in all the cells of Date column
sellout_prov1.Date.unique()
array(['2017-01-01 SO Diaria Piezas', '2017-01-02 SO Diaria Piezas',
'2017-01-03 SO Diaria Piezas', '2017-01-04 SO Diaria Piezas',
'2017-01-05 SO Diaria Piezas', '2017-01-06 SO Diaria Piezas',
'2017-01-07 SO Diaria Piezas', '2017-01-08 SO Diaria Piezas',
'2017-01-09 SO Diaria Piezas', '2017-01-10 SO Diaria Piezas',
'2017-01-11 SO Diaria Piezas', '2017-01-12 SO Diaria Piezas',
'2017-01-13 SO Diaria Piezas', '2017-01-14 SO Diaria Piezas',
'2017-01-15 SO Diaria Piezas', '2017-01-16 SO Diaria Piezas',
'2017-01-17 SO Diaria Piezas', '2017-01-18 SO Diaria Piezas',
'2017-01-19 SO Diaria Piezas', '2017-01-20 SO Diaria Piezas',
'2017-01-21 SO Diaria Piezas', '2017-01-22 SO Diaria Piezas',
'2017-01-23 SO Diaria Piezas', '2017-01-24 SO Diaria Piezas',
'2017-01-25 SO Diaria Piezas', '2017-01-26 SO Diaria Piezas',
'2017-01-27 SO Diaria Piezas', '2017-01-28 SO Diaria Piezas',
'2017-01-29 SO Diaria Piezas', '2017-01-30 SO Diaria Piezas',
'2017-01-31 SO Diaria Piezas', '2017-02-01 SO Diaria Piezas',
'2017-02-02 SO Diaria Piezas', '2017-02-03 SO Diaria Piezas',
'2017-02-04 SO Diaria Piezas', '2017-02-05 SO Diaria Piezas',
'2017-02-06 SO Diaria Piezas', '2017-02-07 SO Diaria Piezas',
'2017-02-08 SO Diaria Piezas', '2017-02-09 SO Diaria Piezas',
'2017-02-10 SO Diaria Piezas', '2017-02-11 SO Diaria Piezas',
'2017-02-12 SO Diaria Piezas', '2017-02-13 SO Diaria Piezas',
'2017-02-14 SO Diaria Piezas', '2017-02-15 SO Diaria Piezas',
'2017-02-16 SO Diaria Piezas', '2017-02-17 SO Diaria Piezas',
'2017-02-18 SO Diaria Piezas', '2017-02-19 SO Diaria Piezas',
'2017-02-20 SO Diaria Piezas', '2017-02-21 SO Diaria Piezas',
'2017-02-22 SO Diaria Piezas', '2017-02-23 SO Diaria Piezas',
'2017-02-24 SO Diaria Piezas', '2017-02-25 SO Diaria Piezas',
'2017-02-26 SO Diaria Piezas', '2017-02-27 SO Diaria Piezas',
'2017-02-28 SO Diaria Piezas', '2017-03-01 SO Diaria Piezas',
'2017-03-02 SO Diaria Piezas', '2017-03-03 SO Diaria Piezas',
'2017-03-04 SO Diaria Piezas', '2017-03-05 SO Diaria Piezas',
'2017-03-06 SO Diaria Piezas', '2017-03-07 SO Diaria Piezas',
'2017-03-08 SO Diaria Piezas', '2017-03-09 SO Diaria Piezas',
'2017-03-10 SO Diaria Piezas', '2017-03-11 SO Diaria Piezas',
'2017-03-12 SO Diaria Piezas', '2017-03-13 SO Diaria Piezas',
'2017-03-14 SO Diaria Piezas', '2017-03-15 SO Diaria Piezas',
'2017-03-16 SO Diaria Piezas', '2017-03-17 SO Diaria Piezas',
'2017-03-18 SO Diaria Piezas', '2017-03-19 SO Diaria Piezas',
'2017-03-20 SO Diaria Piezas', '2017-03-21 SO Diaria Piezas',
'2017-03-22 SO Diaria Piezas', '2017-03-23 SO Diaria Piezas',
'2017-03-24 SO Diaria Piezas', '2017-03-25 SO Diaria Piezas',
'2017-03-26 SO Diaria Piezas', '2017-03-27 SO Diaria Piezas',
'2017-03-28 SO Diaria Piezas', '2017-03-29 SO Diaria Piezas',
'2017-03-30 SO Diaria Piezas', '2017-03-31 SO Diaria Piezas',
'2017-04-01 SO Diaria Piezas', '2017-04-02 SO Diaria Piezas',
'2017-04-03 SO Diaria Piezas', '2017-04-04 SO Diaria Piezas',
'2017-04-05 SO Diaria Piezas', '2017-04-06 SO Diaria Piezas',
'2017-04-07 SO Diaria Piezas', '2017-04-08 SO Diaria Piezas',
'2017-04-09 SO Diaria Piezas', '2017-04-10 SO Diaria Piezas',
'2017-04-11 SO Diaria Piezas', '2017-04-12 SO Diaria Piezas',
'2017-04-13 SO Diaria Piezas', '2017-04-14 SO Diaria Piezas',
'2017-04-15 SO Diaria Piezas', '2017-04-16 SO Diaria Piezas',
'2017-04-17 SO Diaria Piezas', '2017-04-18 SO Diaria Piezas',
'2017-04-19 SO Diaria Piezas', '2017-04-20 SO Diaria Piezas',
'2017-04-21 SO Diaria Piezas', '2017-04-22 SO Diaria Piezas',
'2017-04-23 SO Diaria Piezas', '2017-04-24 SO Diaria Piezas',
'2017-04-25 SO Diaria Piezas', '2017-04-26 SO Diaria Piezas',
'2017-04-27 SO Diaria Piezas', '2017-04-28 SO Diaria Piezas',
'2017-04-29 SO Diaria Piezas', '2017-04-30 SO Diaria Piezas',
'2017-05-01 SO Diaria Piezas', '2017-05-02 SO Diaria Piezas',
'2017-05-03 SO Diaria Piezas', '2017-05-04 SO Diaria Piezas',
'2017-05-05 SO Diaria Piezas', '2017-05-06 SO Diaria Piezas',
'2017-05-07 SO Diaria Piezas', '2017-05-08 SO Diaria Piezas',
'2017-05-09 SO Diaria Piezas', '2017-05-10 SO Diaria Piezas',
'2017-05-11 SO Diaria Piezas', '2017-05-12 SO Diaria Piezas',
'2017-05-13 SO Diaria Piezas', '2017-05-14 SO Diaria Piezas',
'2017-05-15 SO Diaria Piezas', '2017-05-16 SO Diaria Piezas',
'2017-05-17 SO Diaria Piezas', '2017-05-18 SO Diaria Piezas',
'2017-05-19 SO Diaria Piezas', '2017-05-20 SO Diaria Piezas',
'2017-05-21 SO Diaria Piezas', '2017-05-22 SO Diaria Piezas',
'2017-05-23 SO Diaria Piezas', '2017-05-24 SO Diaria Piezas',
'2017-05-25 SO Diaria Piezas', '2017-05-26 SO Diaria Piezas',
'2017-05-27 SO Diaria Piezas', '2017-05-28 SO Diaria Piezas',
'2017-05-29 SO Diaria Piezas', '2017-05-30 SO Diaria Piezas',
'2017-05-31 SO Diaria Piezas', '2017-06-01 SO Diaria Piezas',
'2017-06-02 SO Diaria Piezas', '2017-06-03 SO Diaria Piezas',
'2017-06-04 SO Diaria Piezas'], dtype=object)
# Imported a function from src.utils.mining_data_tb to split the column and delete SO Diaria Pieza
split_column(df=sellout_prov1, column="Date", separation_object=" ", times=1)
# Checking if the split was done well
sellout_prov1.Date.unique()
array(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
'2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
'2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
'2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
'2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
'2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
'2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
'2017-01-29', '2017-01-30', '2017-01-31', '2017-02-01',
'2017-02-02', '2017-02-03', '2017-02-04', '2017-02-05',
'2017-02-06', '2017-02-07', '2017-02-08', '2017-02-09',
'2017-02-10', '2017-02-11', '2017-02-12', '2017-02-13',
'2017-02-14', '2017-02-15', '2017-02-16', '2017-02-17',
'2017-02-18', '2017-02-19', '2017-02-20', '2017-02-21',
'2017-02-22', '2017-02-23', '2017-02-24', '2017-02-25',
'2017-02-26', '2017-02-27', '2017-02-28', '2017-03-01',
'2017-03-02', '2017-03-03', '2017-03-04', '2017-03-05',
'2017-03-06', '2017-03-07', '2017-03-08', '2017-03-09',
'2017-03-10', '2017-03-11', '2017-03-12', '2017-03-13',
'2017-03-14', '2017-03-15', '2017-03-16', '2017-03-17',
'2017-03-18', '2017-03-19', '2017-03-20', '2017-03-21',
'2017-03-22', '2017-03-23', '2017-03-24', '2017-03-25',
'2017-03-26', '2017-03-27', '2017-03-28', '2017-03-29',
'2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02',
'2017-04-03', '2017-04-04', '2017-04-05', '2017-04-06',
'2017-04-07', '2017-04-08', '2017-04-09', '2017-04-10',
'2017-04-11', '2017-04-12', '2017-04-13', '2017-04-14',
'2017-04-15', '2017-04-16', '2017-04-17', '2017-04-18',
'2017-04-19', '2017-04-20', '2017-04-21', '2017-04-22',
'2017-04-23', '2017-04-24', '2017-04-25', '2017-04-26',
'2017-04-27', '2017-04-28', '2017-04-29', '2017-04-30',
'2017-05-01', '2017-05-02', '2017-05-03', '2017-05-04',
'2017-05-05', '2017-05-06', '2017-05-07', '2017-05-08',
'2017-05-09', '2017-05-10', '2017-05-11', '2017-05-12',
'2017-05-13', '2017-05-14', '2017-05-15', '2017-05-16',
'2017-05-17', '2017-05-18', '2017-05-19', '2017-05-20',
'2017-05-21', '2017-05-22', '2017-05-23', '2017-05-24',
'2017-05-25', '2017-05-26', '2017-05-27', '2017-05-28',
'2017-05-29', '2017-05-30', '2017-05-31', '2017-06-01',
'2017-06-02', '2017-06-03', '2017-06-04'], dtype=object)
# Imported the function from src.utils.mining_data_tb to change the date from object to datetime format
changing_dates(df=sellout_prov1, name_column="Date")
# Checking the change of the Dtype of Date column was done well
sellout_prov1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1206365 entries, 0 to 1206364 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sku 1206365 non-null int64 1 Date 1206365 non-null datetime64[ns] 2 Store_Id 1206365 non-null int64 3 Units 1206365 non-null float64 4 Price 1206365 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(2) memory usage: 46.0 MB
# A fundamental part is select the product from X Date to Y Date, so it's important to sort the DataFrame by Date
# Here we can see that have been sold product only in 2017
sellout_prov1 = sellout_prov1.sort_values("Date")
sellout_prov1
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 | 18 | 6.00 | 15.600000 |
| 293184 | 4422187 | 2017-01-01 | 102 | 0.50 | 22.000000 |
| 293185 | 4422187 | 2017-01-01 | 105 | 1.00 | 22.000000 |
| 293186 | 4422187 | 2017-01-01 | 106 | 1.50 | 22.000000 |
| 293187 | 4422187 | 2017-01-01 | 107 | 0.00 | 0.000000 |
| ... | ... | ... | ... | ... | ... |
| 485316 | 4533931 | 2017-06-04 | 112 | 1.35 | 56.666667 |
| 485315 | 4533931 | 2017-06-04 | 111 | 4.50 | 56.666667 |
| 485314 | 4533931 | 2017-06-04 | 110 | 0.00 | 0.000000 |
| 485324 | 4533931 | 2017-06-04 | 121 | 6.30 | 56.666667 |
| 1206364 | 4829827 | 2017-06-04 | 709 | 0.00 | 0.000000 |
1206365 rows × 5 columns
# Deleting the units equal to 0 because they don't provide information
sellout_prov1 = sellout_prov1.loc[(sellout_prov1["Units"] != 0)]
sellout_prov1
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 | 18 | 6.00 | 15.600000 |
| 293184 | 4422187 | 2017-01-01 | 102 | 0.50 | 22.000000 |
| 293185 | 4422187 | 2017-01-01 | 105 | 1.00 | 22.000000 |
| 293186 | 4422187 | 2017-01-01 | 106 | 1.50 | 22.000000 |
| 293192 | 4422187 | 2017-01-01 | 113 | 0.25 | 22.000000 |
| ... | ... | ... | ... | ... | ... |
| 485319 | 4533931 | 2017-06-04 | 116 | 6.30 | 56.666667 |
| 485318 | 4533931 | 2017-06-04 | 114 | 0.90 | 56.666667 |
| 485316 | 4533931 | 2017-06-04 | 112 | 1.35 | 56.666667 |
| 485315 | 4533931 | 2017-06-04 | 111 | 4.50 | 56.666667 |
| 485324 | 4533931 | 2017-06-04 | 121 | 6.30 | 56.666667 |
638363 rows × 5 columns
# Remembering how the dataframe is
sellout_prov2.head()
| Sku | Date | Store_Id | Pieces | Price | |
|---|---|---|---|---|---|
| 0 | 4240039 | 2016/01/01 | 1000 | 23.0 | $16.00 |
| 1 | 4240039 | 2016/01/01 | 1001 | 116.0 | $16.00 |
| 2 | 4240039 | 2016/01/01 | 1002 | 146.0 | $16.00 |
| 3 | 4240039 | 2016/01/01 | 1003 | 17.0 | $16.80 |
| 4 | 4240039 | 2016/01/01 | 1004 | 5.0 | $16.70 |
# Imported the function from src.utils.mining_data_tb to change the date from object to datetime format
changing_dates(df=sellout_prov2, name_column="Date")
# Checking if the format is the correct one
sellout_prov2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4862026 entries, 0 to 4862028 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 Sku int64 1 Date datetime64[ns] 2 Store_Id int64 3 Pieces float64 4 Price object dtypes: datetime64[ns](1), float64(1), int64(2), object(1) memory usage: 222.6+ MB
# The Price column has elements that make it difficult to analyze, so I remove those elements and pass it to float format.
sellout_prov2['Price'] = sellout_prov2['Price'].str.replace('$', '').astype(float)
# Checking the change was done well
sellout_prov2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4862026 entries, 0 to 4862028 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 Sku int64 1 Date datetime64[ns] 2 Store_Id int64 3 Pieces float64 4 Price float64 dtypes: datetime64[ns](1), float64(2), int64(2) memory usage: 222.6 MB
# Changing the name of the column Pieces to Units as the dataframe of the other supplier
sellout_prov2 = sellout_prov2.rename(columns={'Pieces': 'Units'})
sellout_prov2
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4240039 | 2016-01-01 | 1000 | 23.0 | 16.00 |
| 1 | 4240039 | 2016-01-01 | 1001 | 116.0 | 16.00 |
| 2 | 4240039 | 2016-01-01 | 1002 | 146.0 | 16.00 |
| 3 | 4240039 | 2016-01-01 | 1003 | 17.0 | 16.80 |
| 4 | 4240039 | 2016-01-01 | 1004 | 5.0 | 16.70 |
| ... | ... | ... | ... | ... | ... |
| 4862024 | 4729193 | 2016-12-30 | 5827 | 14.0 | 17.90 |
| 4862025 | 4729193 | 2016-12-30 | 5850 | 3.0 | 17.90 |
| 4862026 | 4729193 | 2016-12-30 | 5851 | 3.0 | 16.00 |
| 4862027 | 4729193 | 2016-12-30 | 5855 | 21.0 | 16.86 |
| 4862028 | 4729193 | 2016-08-20 | 3766 | 12.0 | 13.60 |
4862026 rows × 5 columns
# A fundamental part is select the product from X Date to Y Date, so it's important to sort the DataFrame by Date
# Here we can see that have been sold product only in 2016
sellout_prov2 = sellout_prov2.sort_values("Date")
sellout_prov2
| Sku | Date | Store_Id | Units | Price | |
|---|---|---|---|---|---|
| 0 | 4240039 | 2016-01-01 | 1000 | 23.00 | 16.0 |
| 4013864 | 4625287 | 2016-01-01 | 3399 | 33.25 | 7.5 |
| 4013865 | 4625287 | 2016-01-01 | 3400 | 2.50 | 7.5 |
| 4013866 | 4625287 | 2016-01-01 | 3401 | 3.00 | 7.5 |
| 4013867 | 4625287 | 2016-01-01 | 3402 | 1.00 | 7.5 |
| ... | ... | ... | ... | ... | ... |
| 4063782 | 4625287 | 2016-12-30 | 3765 | 4.50 | 9.8 |
| 4063781 | 4625287 | 2016-12-30 | 3764 | 0.25 | 9.8 |
| 4063780 | 4625287 | 2016-12-30 | 3763 | 0.25 | 9.8 |
| 4063788 | 4625287 | 2016-12-30 | 3772 | 12.00 | 9.8 |
| 3307007 | 4543948 | 2016-12-30 | 5825 | 10.80 | 40.0 |
4862026 rows × 5 columns
# Deleting the units equal to 0 because they don't provide information
# For the second supplier there weren't any
sellout_prov2.loc[(sellout_prov2["Units"] == 0)]
| Sku | Date | Store_Id | Units | Price |
|---|
# Remembering how the dataframe is
maestro_prod.head()
| Sku | Sku_Description | Sku_Category | Sku_Subcategory | |
|---|---|---|---|---|
| 0 | 11058 | Frosted Mango | Sugar-free | Cereal bars |
| 1 | 29207 | Nut Cheerios Mango | Super-flavour | Cereal box |
| 2 | 38379 | MultiGrain Mango | Sugar-free | Cereal bars |
| 3 | 42671 | Crunch Mango | Sugar-free | Cereal bars |
| 4 | 54644 | Corn Mango | Sugar-free | Cereal bars |
# Checking if all the formats are correct
maestro_prod.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1242 entries, 0 to 1241 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sku 1242 non-null int64 1 Sku_Description 1242 non-null object 2 Sku_Category 1242 non-null object 3 Sku_Subcategory 1242 non-null object dtypes: int64(1), object(3) memory usage: 38.9+ KB
# Remembering how the dataframe is
maestro_client.head()
| NUM TDA | NOMBRE DE TDA | CADENA | FORMATO | MUNICIPIO/CIUDAD | ESTADO | DIVISION | DEPOSITO / DISTRIBUIDOR | ESTRUCTURA | GRADO | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Bloody Sea Green BookShop | proveedor_1 | Shop | Bizarre White Shop | Coalmar | SURESTE | Mogulval Corp. | DISTRIBUIDORES | VI |
| 1 | 4 | Funny Sky Blue GipsyTrade | proveedor_1 | Shop | Bizarre White Shop | Coalmar | SURESTE | Mogulval Corp. | DISTRIBUIDORES | VI |
| 2 | 5 | Rare Green Laundrette | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
| 3 | 6 | Angry Indigo Center | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
| 4 | 7 | Tall Violet Deli | proveedor_1 | Shop | Rare Orange Applepie | Coalmar | SURESTE | Eioval Aerospace | FORANEO | VI |
# Checking if all the formats are correct
maestro_client.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2452 entries, 0 to 2451 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NUM TDA 2452 non-null int64 1 NOMBRE DE TDA 2452 non-null object 2 CADENA 2452 non-null object 3 FORMATO 2452 non-null object 4 MUNICIPIO/CIUDAD 2452 non-null object 5 ESTADO 2452 non-null object 6 DIVISION 2452 non-null object 7 DEPOSITO / DISTRIBUIDOR 2452 non-null object 8 ESTRUCTURA 2452 non-null object 9 GRADO 2452 non-null object dtypes: int64(1), object(9) memory usage: 191.7+ KB
# Changing the name of the columns to same language as the others DataFrames and same view, I mean change from uppercase to lowercase
maestro_client = maestro_client.rename(columns={'NUM TDA': 'Store_Id', 'NOMBRE DE TDA': 'Store_Name', 'CADENA': 'Supplier', 'FORMATO': 'Format', 'MUNICIPIO/CIUDAD': 'Town_or_City', 'ESTADO': 'State', 'DIVISION': 'Division', 'DEPOSITO / DISTRIBUIDOR': 'Distributor', 'ESTRUCTURA': 'Structure', 'GRADO': 'Grade'})
Now, I am going to merge sellout dataframes (of both suppliers) with maestro_prod (description of the product) and maestro_client (description of the store)
# First divide maestro_client in 2 dataframes: Supplier 1 and Supplier 2
maestro_client_prov1 = maestro_client.loc[maestro_client["Supplier"] == "proveedor_1"]
maestro_client_prov2 = maestro_client.loc[maestro_client["Supplier"] == "proveedor_2"]
# Merging sellout of first supplier with the store's description dataframe (maestro_client_prov1)
df_merge1 = sellout_prov1.merge(maestro_client_prov1)
# Merging sellout of second supplier with the store's description dataframe (maestro_client_prov2)
df_merge2 = sellout_prov2.merge(maestro_client_prov2)
Comparing the difference of rows to see if it is matched every item in the merge of both suppliers
## First supplier
# The result is a large number of no matches (products of sellout dataframe that doesn't match with any store)
sellout_prov1.shape[0] - df_merge1.shape[0]
10756
## Second supplier
# Here we can see that we have some shops that didn't buy anything
sellout_prov2.shape[0] - df_merge2.shape[0]
-323
Taking this into account, I compare if all the store_Id which have in products chart is in the list of stores chart.
# Passing the array of store_id to list
lista_prov1 = list(sellout_prov1.Store_Id.unique())
lista_prov2 = list(sellout_prov2.Store_Id.unique())
lista_maestro_prov1 = list(maestro_client_prov1.Store_Id.unique())
lista_maestro_prov2 = list(maestro_client_prov2.Store_Id.unique())
# Here I compare if the store id of products chart is in stores chart
# The result is 10 stores that we don't know their name and other information
for x in lista_prov1:
if x in lista_maestro_prov1:
continue
else:
print(x)
94 78 676 709 174 675 712 681
# Again with second supplier
# Everything is ok
for x in lista_prov2:
if x in lista_maestro_prov2:
continue
else:
print(x)
So we have two options:
1. Talk with the client and tell the problem to know which stores are these.
2. Continue ignoring these stores.
In this case, second option is the choice because I can't talk to the client.
After this, I checked if in store's chart there are any Id duplicated.
# In the case of the first supplier we have an Id duplicated
maestro_client_prov1[maestro_client_prov1.duplicated("Store_Id")]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 235 | 143 | Empty Turquoise Baker | proveedor_1 | Shop | Clean Aqua Soil | Nabargus | OCCIDENTE | Colorvit Ifonforge Industries | FORANEO | III |
# Filtering for the Id duplicated to see how many stores have the same Id
maestro_client_prov1.loc[(maestro_client_prov1["Store_Id"] == 143),:]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 115 | 143 | Empty Turquoise Baker | proveedor_1 | Shop | Clean Aqua Soil | Egusgul | NOROESTE | Gornvalval Industrail | DISTRIBUIDORES | IV |
| 235 | 143 | Empty Turquoise Baker | proveedor_1 | Shop | Clean Aqua Soil | Nabargus | OCCIDENTE | Colorvit Ifonforge Industries | FORANEO | III |
# Again for the second supplier
maestro_client_prov2[maestro_client_prov2.duplicated("Store_Id")]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2422 | 2568 | Impressive Coral BigCom | proveedor_2 | Grocery | Legendary Azure Tree | Mobartho | PENÍNSULA | Valarar Toughworks | FORANEO | VI |
# Filtering for the Id duplicated to see how many stores have the same Id
maestro_client_prov2.loc[(maestro_client_prov2["Store_Id"] == 2568),:]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2409 | 2568 | Impressive Coral BigCom | proveedor_2 | Grocery | First Yellow Tower | Mobartho | PENÍNSULA | Valarar Toughworks | FORANEO | VI |
| 2422 | 2568 | Impressive Coral BigCom | proveedor_2 | Grocery | Legendary Azure Tree | Mobartho | PENÍNSULA | Valarar Toughworks | FORANEO | VI |
Same Id different shops, we have three options:
1. Change the Store_Id for the correct Id, asking the client.
2. Change the Store_Id for other new Id.
3. Continue deleting the second one (235 in the index first supplier) (2422 in the index second supplier)
In this case, I choose deleting the second one because I can't talk to the client and If I put another Id this will continue generating an error because any product will not match with that Store_Id.
# Deleting the second store with same Id
maestro_client_prov1.drop([235], axis=0, inplace = True)
maestro_client_prov2.drop([2422], axis=0, inplace = True)
# Checking the deleting was done well for first supplier
maestro_client_prov1.loc[(maestro_client_prov1["Store_Id"] == 235),:]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 146 | 235 | Empty White Florist | proveedor_1 | Shop | Funny Lemon Star | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO | V |
# Checking the deleting was done well for second supplier
maestro_client_prov2.loc[(maestro_client_prov2["Store_Id"] == 2568),:]
| Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2409 | 2568 | Impressive Coral BigCom | proveedor_2 | Grocery | First Yellow Tower | Mobartho | PENÍNSULA | Valarar Toughworks | FORANEO | VI |
So now I concatenated the dataframes ignoring the problems.
df_merge_prov1 = sellout_prov1.merge(maestro_client_prov1)
df_merge_prov2 = sellout_prov2.merge(maestro_client_prov2)
df_finalmerge_prov = pd.concat([df_merge_prov1, df_merge_prov2])
df_finalmerge_prov
| Sku | Date | Store_Id | Units | Price | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 | 18 | 6.00 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 1 | 4422187 | 2017-01-01 | 18 | 1.25 | 22.0 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 2 | 4412040 | 2017-01-01 | 18 | 9.00 | 22.0 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 3 | 4523924 | 2017-01-01 | 18 | 5.00 | 54.0 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 4 | 4442378 | 2017-01-01 | 18 | 75.00 | 12.9 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4862021 | 4709131 | 2016-12-30 | 1252 | 8.00 | 13.5 | Last Purple LunchHall | proveedor_2 | TinyShop | Evil Pink Vessel | Coalmar | SURESTE | Valrodgus Vocals | DISTRIBUIDORES | IV |
| 4862022 | 4330800 | 2016-12-30 | 1252 | 3.00 | 16.8 | Last Purple LunchHall | proveedor_2 | TinyShop | Evil Pink Vessel | Coalmar | SURESTE | Valrodgus Vocals | DISTRIBUIDORES | IV |
| 4862023 | 4300343 | 2016-12-30 | 1252 | 7.20 | 40.0 | Last Purple LunchHall | proveedor_2 | TinyShop | Evil Pink Vessel | Coalmar | SURESTE | Valrodgus Vocals | DISTRIBUIDORES | IV |
| 4862024 | 4442378 | 2016-12-30 | 1252 | 91.00 | 13.4 | Last Purple LunchHall | proveedor_2 | TinyShop | Evil Pink Vessel | Coalmar | SURESTE | Valrodgus Vocals | DISTRIBUIDORES | IV |
| 4862025 | 4625287 | 2016-12-30 | 1252 | 31.75 | 9.8 | Last Purple LunchHall | proveedor_2 | TinyShop | Evil Pink Vessel | Coalmar | SURESTE | Valrodgus Vocals | DISTRIBUIDORES | IV |
5487915 rows × 14 columns
# Checking the concat was done correctly - same number of rows
df_merge_prov1.shape[0] + df_merge_prov2.shape[0]
5487915
Now, I did a checkup to know if all the sku that I have in store's description dataframe are in the description product dataframe.
# Here I did the list of sku of both dataframe
lista_finalmerge_sku_prov = list(df_finalmerge_prov.Sku.unique())
lista_maestro_prod = list(maestro_prod.Sku.unique())
# Here I compare both list, result: 1 product didn't identificate
for x in lista_finalmerge_sku_prov:
if x in lista_maestro_prod:
continue
else:
print(x)
Again we need to know this product, invent it or ignore it.
In this case, I'm going to ignore it because I can't know it asking the client and if I invent it this will return a non-real result.
# So now, I'm going to merge with maesto_prod (description products dataframe) to have the final sellout dataframe
df_sellout_final = df_finalmerge_prov.merge(maestro_prod)
df_sellout_final
| Sku | Date | Store_Id | Units | Price | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | Sku_Description | Sku_Category | Sku_Subcategory | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4220015 | 2017-01-01 | 18 | 6.0 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV | Stars Chocolate | Sugar-free | Cereal bars |
| 1 | 4220015 | 2017-01-02 | 18 | 15.0 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV | Stars Chocolate | Sugar-free | Cereal bars |
| 2 | 4220015 | 2017-01-03 | 18 | 10.0 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV | Stars Chocolate | Sugar-free | Cereal bars |
| 3 | 4220015 | 2017-01-04 | 18 | 8.0 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV | Stars Chocolate | Sugar-free | Cereal bars |
| 4 | 4220015 | 2017-01-05 | 18 | 12.0 | 15.6 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV | Stars Chocolate | Sugar-free | Cereal bars |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5487910 | 4371364 | 2016-07-20 | 3851 | 4.0 | 185.0 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V | Peanut Butter Mango Peach | Super-flavour | Cereal box |
| 5487911 | 4371364 | 2016-07-21 | 3851 | 4.0 | 185.0 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V | Peanut Butter Mango Peach | Super-flavour | Cereal box |
| 5487912 | 4371364 | 2016-07-23 | 3851 | 8.0 | 185.0 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V | Peanut Butter Mango Peach | Super-flavour | Cereal box |
| 5487913 | 4371364 | 2016-07-26 | 3851 | 4.0 | 185.0 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V | Peanut Butter Mango Peach | Super-flavour | Cereal box |
| 5487914 | 4371364 | 2016-07-28 | 3851 | 16.0 | 185.0 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V | Peanut Butter Mango Peach | Super-flavour | Cereal box |
5487915 rows × 17 columns
Now, I compared if the merge was done well.
# For this, I check how many rows we have with the Sku that we dont know (5027) and then I compared the rows of both dataframes to check if it is the same number.
df_finalmerge_prov.loc[(df_finalmerge_prov["Sku"] == 5027), :].shape[0]
0
# Yes, the difference is the same number so the merge was done well
df_finalmerge_prov.shape[0] - df_sellout_final.shape[0]
0
Now, I am going to change the order of the columns so the dataframe makes more sense.
# First, passing the columns to list of columns
cols_sellout = df_sellout_final.columns.tolist()
cols_sellout
['Sku', 'Date', 'Store_Id', 'Units', 'Price', 'Store_Name', 'Supplier', 'Format', 'Town_or_City', 'State', 'Division', 'Distributor', 'Structure', 'Grade', 'Sku_Description', 'Sku_Category', 'Sku_Subcategory']
# Second, sorting the columns with a new order
cols_sellout = cols_sellout[1:2] + cols_sellout[0:1] + cols_sellout[14:-1] + cols_sellout[3:5] + cols_sellout[2:3] + cols_sellout[5:14]
print(cols_sellout)
['Date', 'Sku', 'Sku_Description', 'Sku_Category', 'Units', 'Price', 'Store_Id', 'Store_Name', 'Supplier', 'Format', 'Town_or_City', 'State', 'Division', 'Distributor', 'Structure', 'Grade']
# Finally, setting the dataframe with the new order
df_sellout_final = df_sellout_final[cols_sellout]
df_sellout_final
| Date | Sku | Sku_Description | Sku_Category | Units | Price | Store_Id | Store_Name | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | Grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 | 4220015 | Stars Chocolate | Sugar-free | 6.0 | 15.6 | 18 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 1 | 2017-01-02 | 4220015 | Stars Chocolate | Sugar-free | 15.0 | 15.6 | 18 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 2 | 2017-01-03 | 4220015 | Stars Chocolate | Sugar-free | 10.0 | 15.6 | 18 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 3 | 2017-01-04 | 4220015 | Stars Chocolate | Sugar-free | 8.0 | 15.6 | 18 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| 4 | 2017-01-05 | 4220015 | Stars Chocolate | Sugar-free | 12.0 | 15.6 | 18 | Singing Black Center | proveedor_1 | Shop | Evil White Baton | Alloralf | METRO-NORTE | Emolor Vocals | METRO | IV |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5487910 | 2016-07-20 | 4371364 | Peanut Butter Mango Peach | Super-flavour | 4.0 | 185.0 | 3851 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V |
| 5487911 | 2016-07-21 | 4371364 | Peanut Butter Mango Peach | Super-flavour | 4.0 | 185.0 | 3851 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V |
| 5487912 | 2016-07-23 | 4371364 | Peanut Butter Mango Peach | Super-flavour | 8.0 | 185.0 | 3851 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V |
| 5487913 | 2016-07-26 | 4371364 | Peanut Butter Mango Peach | Super-flavour | 4.0 | 185.0 | 3851 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V |
| 5487914 | 2016-07-28 | 4371364 | Peanut Butter Mango Peach | Super-flavour | 16.0 | 185.0 | 3851 | Evil Sky Blue TallMarket | proveedor_2 | Leftorium | Enlightened Dark Empire | Magornmar | METRO-SUR | Thrilthoal Empire | METRO | V |
5487915 rows × 16 columns
First, I'm going to compare if all the sku of sellin dataframe are in maestro_prod (description product dataframe)
# Passing the array of store_id to list
lista_sellin_sku = list(sellin.Sku.unique())
lista_maestroprod_sku = list(maestro_prod.Sku.unique())
# Here I compare if the Sku of sellin dataframe is in maestro_prod (description product dataframe)
# The result: all the sku of sellin are in maestro_prod dataframe
for x in lista_sellin_sku:
if x in lista_maestroprod_sku:
continue
else:
print(x)
# Everything is correct so it's time to merge sellin with maestro_prod
df_sellin = sellin.merge(maestro_prod)
df_sellin
| Sku | Date | Store_Id | Units | Price | Margin | Sku_Description | Sku_Category | Sku_Subcategory | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 4361050 | 2015-01-04 | 18 | 15.12 | 12.99 | 10.67 | Sugar Mango Peach | Sugar-free | Cereal bars |
| 1 | 4361050 | 2015-01-04 | 60 | 15.12 | 13.06 | 10.74 | Sugar Mango Peach | Sugar-free | Cereal bars |
| 2 | 4361050 | 2015-01-04 | 3504 | 30.24 | 12.53 | 10.22 | Sugar Mango Peach | Sugar-free | Cereal bars |
| 3 | 4361050 | 2015-01-04 | 3636 | 15.12 | 12.36 | 10.04 | Sugar Mango Peach | Sugar-free | Cereal bars |
| 4 | 4361050 | 2015-01-04 | 3725 | 45.36 | 12.53 | 10.22 | Sugar Mango Peach | Sugar-free | Cereal bars |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4141813 | 4749421 | 2017-12-05 | 1108 | 10.80 | 46.36 | 38.55 | Crispy Rice Chocolate Choc Chip | Super-flavour | Cereal box |
| 4141814 | 4749421 | 2017-12-05 | 1083 | 10.80 | 46.36 | 38.55 | Crispy Rice Chocolate Choc Chip | Super-flavour | Cereal box |
| 4141815 | 4749421 | 2017-12-05 | 1015 | 21.60 | 46.36 | 38.55 | Crispy Rice Chocolate Choc Chip | Super-flavour | Cereal box |
| 4141816 | 4749421 | 2017-12-05 | 232 | 5.40 | 46.96 | 39.15 | Crispy Rice Chocolate Choc Chip | Super-flavour | Cereal box |
| 4141817 | 4749421 | 2017-12-05 | 108 | 5.40 | 46.96 | 39.15 | Crispy Rice Chocolate Choc Chip | Super-flavour | Cereal box |
4141818 rows × 9 columns
#Checking the merge was done well
df_sellin.shape[0] - sellin.shape[0]
0
In sellin dataframe I didn't have the information of supplier but I remembered that:
So, in this case, I merged by date ignoring the data from 2015 because I assumed that supplier 1 only bought in 2017 and supplier 2 only bought in 2016.
# Recovering the two dataframes divided by supplier.
maestro_client_prov1 = maestro_client.loc[maestro_client["Supplier"] == "proveedor_1"]
maestro_client_prov2 = maestro_client.loc[maestro_client["Supplier"] == "proveedor_2"]
# Doing a new column with the year
df_sellin['Year'] = df_sellin['Date'].dt.year
df_sellin.head()
| Sku | Date | Store_Id | Units | Price | Margin | Sku_Description | Sku_Category | Sku_Subcategory | Year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4361050 | 2015-01-04 | 18 | 15.12 | 12.99 | 10.67 | Sugar Mango Peach | Sugar-free | Cereal bars | 2015 |
| 1 | 4361050 | 2015-01-04 | 60 | 15.12 | 13.06 | 10.74 | Sugar Mango Peach | Sugar-free | Cereal bars | 2015 |
| 2 | 4361050 | 2015-01-04 | 3504 | 30.24 | 12.53 | 10.22 | Sugar Mango Peach | Sugar-free | Cereal bars | 2015 |
| 3 | 4361050 | 2015-01-04 | 3636 | 15.12 | 12.36 | 10.04 | Sugar Mango Peach | Sugar-free | Cereal bars | 2015 |
| 4 | 4361050 | 2015-01-04 | 3725 | 45.36 | 12.53 | 10.22 | Sugar Mango Peach | Sugar-free | Cereal bars | 2015 |
# Separating sellin into 2 dataframe by Year
sellin_2017 = df_sellin.loc[(df_sellin["Year"] == 2017), :]
sellin_2016 = df_sellin.loc[(df_sellin["Year"] == 2016), :]
# Merging by year-supplier
df_merge_sellprov1 = sellin_2017.merge(maestro_client_prov1, on="Store_Id") # First supplier - Year 2017
df_merge_sellprov2 = sellin_2016.merge(maestro_client_prov2, on="Store_Id") # Second supplier - Year 2016
# Concatenating both suppliers
df_sellin_final = pd.concat([df_merge_sellprov1, df_merge_sellprov2])
# Sorting the dataframe by Date
df_sellin_final = df_sellin_final.sort_values("Date").reset_index(drop=True)
# Deleting Year column
df_sellin_final = df_sellin_final.drop(['Year'], axis=1)
Now, I am going to change the order of the columns so the dataframe makes more sense.
# First, passing the columns to list of columns
cols_sellin = df_sellin_final.columns.tolist()
cols_sellin
['Sku', 'Date', 'Store_Id', 'Units', 'Price', 'Margin', 'Sku_Description', 'Sku_Category', 'Sku_Subcategory', 'Store_Name', 'Supplier', 'Format', 'Town_or_City', 'State', 'Division', 'Distributor', 'Structure', 'Grade']
# Second, sorting the columns with a new order
cols_sellin = cols_sellin[1:2] + cols_sellin[0:1] + cols_sellin[6:9] + cols_sellin[3:6] + cols_sellin[2:3] + cols_sellin[10:-1]
print(cols_sellin)
['Date', 'Sku', 'Sku_Description', 'Sku_Category', 'Sku_Subcategory', 'Units', 'Price', 'Margin', 'Store_Id', 'Supplier', 'Format', 'Town_or_City', 'State', 'Division', 'Distributor', 'Structure']
# Finally, setting the dataframe with the new order
df_sellin_final = df_sellin_final[cols_sellin]
df_sellin_final
| Date | Sku | Sku_Description | Sku_Category | Sku_Subcategory | Units | Price | Margin | Store_Id | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-02 | 4240039 | Lucjy Charms Chocolate | Sugar-free | Cereal bars | 48.00 | 13.74 | 9.46 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 1 | 2016-01-02 | 4330800 | Crunch Mango Peach | Sugar-free | Cereal bars | 84.00 | 14.01 | 9.41 | 3826 | proveedor_2 | GipsyTrade | Last Pink River | Alloralf | METRO-NORTE | Emolor Vocals | METRO |
| 2 | 2016-01-02 | 4769520 | Cornflakes Chocolate Choc Chip | Super-flavour | Cereal box | 14.40 | 35.54 | 26.49 | 3372 | proveedor_2 | Grocery | Angry Navy Empire | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| 3 | 2016-01-02 | 4442378 | Crunchy Bran Mango Peach | Sugar-free | Cereal bars | 24.00 | 11.09 | 5.51 | 2217 | proveedor_2 | Grocery | Angry Navy Empire | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| 4 | 2016-01-02 | 4779537 | Crunchy Bran Chocolate Choc Chip | Super-flavour | Cereal box | 21.60 | 35.37 | 27.41 | 1915 | proveedor_2 | Grocery | Great Violet Rock | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1636361 | 2017-12-05 | 4381425 | Chocos Mango Peach | Sugar-free | Cereal bars | 1200.00 | 16.23 | 11.66 | 606 | proveedor_1 | Shop | Evil Orange Baton | Coalmar | SURESTE | Eioval Aerospace | FORANEO |
| 1636362 | 2017-12-05 | 4719179 | Chocos Chocolate Choc Chip | Sugar-free | Cereal bars | 132.00 | 14.04 | 7.41 | 612 | proveedor_1 | GreatShop | Tall Aquamarine Rock | Arrodio | SURESTE | Mararna Industrail | DISTRIBUIDORES |
| 1636363 | 2017-12-05 | 4819810 | Fruit & Nut Chocolate Choc Chip | Sugar-free | Cereal bars | 33.75 | 21.92 | 16.02 | 606 | proveedor_1 | Shop | Evil Orange Baton | Coalmar | SURESTE | Eioval Aerospace | FORANEO |
| 1636364 | 2017-12-05 | 4230022 | Crunch Life Chocolate | Sugar-free | Cereal bars | 20.00 | 14.93 | 10.85 | 240 | proveedor_1 | Shop | Rare Lemon Key | Alloralf | METRO-SUR | Thrilthoal Empire | METRO |
| 1636365 | 2017-12-05 | 4625287 | Toast Mango Peach | Super-flavour | Cereal box | 6.75 | 36.80 | 27.14 | 9 | proveedor_1 | Shop | Kicking Crimson Key | Coalmar | SURESTE | Arbarrus Gmbh | DISTRIBUIDORES |
1636366 rows × 16 columns
profile_sellin = ProfileReport(df_sellin_final, explorative=True)
profile_sellin
Summarize dataset: 100%|██████████| 29/29 [02:31<00:00, 5.21s/it, Completed] Generate report structure: 100%|██████████| 1/1 [00:06<00:00, 6.77s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.81s/it]
profile_sellout = ProfileReport(df_sellout_final, explorative=True)
profile_sellout
Summarize dataset: 100%|██████████| 29/29 [07:58<00:00, 16.50s/it, Completed] Generate report structure: 100%|██████████| 1/1 [00:06<00:00, 6.84s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.22s/it]
Questions about:
From manufacturer to supplier (sellin)
groupby_skudesc_in = df_sellin_final.groupby("Sku_Description").sum("Units").sort_values("Units", ascending=False).head()
groupby_skudesc_in
| Sku | Units | Price | Margin | Store_Id | |
|---|---|---|---|---|---|
| Sku_Description | |||||
| Crunchy Bran Mango Peach | 444597632618 | 36566700.0 | 1.127846e+06 | 5.684780e+05 | 269065546 |
| Chocos Mango Peach | 463616104950 | 30032922.0 | 1.499999e+06 | 1.038980e+06 | 290258698 |
| Lucjy Charms Chocolate | 413399562461 | 22741251.0 | 1.375978e+06 | 9.576922e+05 | 270075652 |
| Stars Mango Peach | 393282127051 | 14464146.0 | 1.227939e+06 | 8.516122e+05 | 241487936 |
| Muesli Chocolate | 347611262340 | 12033058.0 | 1.156769e+06 | 8.058198e+05 | 227435093 |
Total_sum_in = df_sellin_final.Units.sum()
Total_sum_in
163644362.10000002
Crunchy_Bran = groupby_skudesc_in.Units[0]
Chocos_Mango = groupby_skudesc_in.Units[1]
Lucjy_Charms = groupby_skudesc_in.Units[2]
Stars_Mango = groupby_skudesc_in.Units[3]
Muesli_choco = groupby_skudesc_in.Units[4]
Rest = Total_sum_in - Crunchy_Bran - Chocos_Mango - Lucjy_Charms - Stars_Mango - Muesli_choco
labels = ["Crunchy Bran Mango Peach", "Chocos Mango Peach", "Lucjy Charms Chocolate", "Stars Mango Peach", "Muesli Chocolate", "Others"]
piechart_figure_6labels(x1 = Crunchy_Bran, x2 = Chocos_Mango, x3 = Lucjy_Charms, x4 = Stars_Mango, x5 = Muesli_choco, x6 = Rest, labels = labels, rooth_path = rooth_path, name_fig="\\sellin_sku_units.png")
From supplier to final user (sellout)
groupby_skudesc_out = df_sellout_final.groupby("Sku_Description").sum("Units").sort_values("Units", ascending=False).head()
groupby_skudesc_out
| Sku | Units | Price | Store_Id | |
|---|---|---|---|---|
| Sku_Description | ||||
| Crunchy Bran Mango Peach | 1890129664306 | 29540217.0 | 5.307024e+06 | 1110643675 |
| Chocos Mango Peach | 1844698223475 | 22033855.0 | 7.313245e+06 | 1099968676 |
| Lucjy Charms Chocolate | 1596743566893 | 16293766.0 | 6.068939e+06 | 978365853 |
| Stars Mango Peach | 1685083005573 | 10394099.0 | 6.421853e+06 | 963561595 |
| Muesli Chocolate | 1379968685970 | 9279243.0 | 5.227442e+06 | 834980485 |
Total_sum_out = df_sellout_final.Units.sum()
Total_sum_out
116087604.56999998
Crunchy_Bran = groupby_skudesc_out.Units[0]
Chocos_Mango = groupby_skudesc_out.Units[1]
Lucjy_Charms = groupby_skudesc_out.Units[2]
Stars_Mango = groupby_skudesc_out.Units[3]
Muesli_choco = groupby_skudesc_out.Units[4]
Rest = Total_sum_out - Crunchy_Bran - Chocos_Mango - Lucjy_Charms - Stars_Mango - Muesli_choco
labels = ["Crunchy Bran Mango Peach", "Chocos Mango Peach", "Lucjy Charms Chocolate", "Stars Mango Peach", "Muesli Chocolate", "Others"]
piechart_figure_6labels(x1 = Crunchy_Bran, x2 = Chocos_Mango, x3 = Lucjy_Charms, x4 = Stars_Mango, x5 = Muesli_choco, x6 = Rest, labels = labels, rooth_path = rooth_path, name_fig="\\sellout_sku_units.png")
groupby_state_in = df_sellin_final.groupby("State").sum("Units").sort_values("Units", ascending=False).head()
groupby_state_in
| Sku | Units | Price | Margin | Store_Id | |
|---|---|---|---|---|---|
| State | |||||
| Magornmar | 1973759209602 | 4.565979e+07 | 9.598095e+06 | 6.928638e+06 | 1294553262 |
| Alloralf | 2412775932836 | 4.100110e+07 | 1.179025e+07 | 8.517924e+06 | 1464553008 |
| Coalmar | 212894785946 | 1.073659e+07 | 9.533726e+05 | 6.710992e+05 | 107749039 |
| Thrilvallor | 319946309797 | 9.595306e+06 | 1.557989e+06 | 1.122465e+06 | 186706861 |
| Lorvalmo | 317491944331 | 5.884945e+06 | 1.507395e+06 | 1.088734e+06 | 216182129 |
x = groupby_state_in.index
y = groupby_state_in.Units
histogram_figure(x = x, y = y, x_label = "State which sells more", y_label = "Units from 2016 to 2017 included", rooth_path = rooth_path, name_fig="\\sellin_state_units.png")
groupby_state_out = df_sellout_final.groupby("State").sum("Units").sort_values("Units", ascending=False).head()
groupby_state_out
| Sku | Units | Price | Store_Id | |
|---|---|---|---|---|
| State | ||||
| Magornmar | 4209177916411 | 2.875981e+07 | 1.706816e+07 | 2580483506 |
| Alloralf | 5639320077513 | 2.684519e+07 | 2.226254e+07 | 3283126469 |
| Coalmar | 1415816283428 | 9.812334e+06 | 5.723446e+06 | 628380956 |
| Thrilvallor | 1359240603107 | 7.110587e+06 | 5.250879e+06 | 730568701 |
| Arrodio | 456203932115 | 4.581163e+06 | 1.893339e+06 | 175801043 |
x = groupby_state_out.index
y = groupby_state_out.Units
histogram_figure(x = x, y = y, x_label = "State which sells more", y_label = "Units from 2016 to 2017 included", rooth_path = rooth_path, name_fig="\\sellout_state_units.png")
From manufacturer to supplier (sellin)
groupby_date_in = df_sellin_final.groupby(pd.Grouper(key='Date', freq='M')).sum("Units").sort_values("Date", ascending=False)
groupby_date_in
| Sku | Units | Price | Margin | Store_Id | |
|---|---|---|---|---|---|
| Date | |||||
| 2017-12-31 | 9168396779 | 4.159354e+05 | 5.079074e+04 | 3.734948e+04 | 526270 |
| 2017-11-30 | 15763219020 | 7.391822e+05 | 8.415861e+04 | 6.174952e+04 | 778595 |
| 2017-10-31 | 14588625327 | 7.013559e+05 | 7.589648e+04 | 5.536795e+04 | 840030 |
| 2017-09-30 | 12690328078 | 6.920469e+05 | 6.451575e+04 | 4.699136e+04 | 624521 |
| 2017-08-31 | 12514016439 | 6.539557e+05 | 6.556072e+04 | 4.805520e+04 | 666102 |
| 2017-07-31 | 14328346453 | 7.435844e+05 | 7.078465e+04 | 5.129925e+04 | 757237 |
| 2017-06-30 | 14121651266 | 6.160464e+05 | 7.520529e+04 | 5.522697e+04 | 785336 |
| 2017-05-31 | 60706792432 | 2.786083e+06 | 3.323382e+05 | 2.449176e+05 | 3242997 |
| 2017-04-30 | 60854617213 | 2.873729e+06 | 3.349229e+05 | 2.472771e+05 | 3143986 |
| 2017-03-31 | 56801328050 | 2.687905e+06 | 2.931848e+05 | 2.149726e+05 | 3108537 |
| 2017-02-28 | 52076893858 | 2.473789e+06 | 2.693417e+05 | 1.976546e+05 | 2752742 |
| 2017-01-31 | 61061599923 | 2.483090e+06 | 3.101233e+05 | 2.242064e+05 | 3283342 |
| 2016-12-31 | 616140292802 | 1.257147e+07 | 3.025843e+06 | 2.202930e+06 | 400246629 |
| 2016-11-30 | 558344250373 | 1.170258e+07 | 2.759645e+06 | 2.005324e+06 | 362529417 |
| 2016-10-31 | 587518273281 | 1.205778e+07 | 2.862958e+06 | 2.063200e+06 | 382490655 |
| 2016-09-30 | 605093745350 | 1.300326e+07 | 2.894926e+06 | 2.081895e+06 | 393027955 |
| 2016-08-31 | 608867363957 | 1.331020e+07 | 2.951609e+06 | 2.123993e+06 | 397170324 |
| 2016-07-31 | 580729602590 | 1.249658e+07 | 2.806169e+06 | 2.009365e+06 | 377250075 |
| 2016-06-30 | 571320359697 | 1.230265e+07 | 2.792318e+06 | 2.004047e+06 | 372508008 |
| 2016-05-31 | 619991911693 | 1.314276e+07 | 3.020501e+06 | 2.168794e+06 | 403915095 |
| 2016-04-30 | 608744221731 | 1.237674e+07 | 2.956126e+06 | 2.120196e+06 | 395643821 |
| 2016-03-31 | 553380799540 | 1.103531e+07 | 2.716348e+06 | 1.953547e+06 | 359812276 |
| 2016-02-29 | 542160080529 | 1.117074e+07 | 2.586423e+06 | 1.854979e+06 | 353159130 |
| 2016-01-31 | 547389421707 | 1.060760e+07 | 2.653974e+06 | 1.905024e+06 | 356858213 |
x = groupby_date_in.index
y = groupby_date_in.Units
linediagram_figure(x = x, y = y, x_label = "State which sells more", y_label = "Units from 2016 to 2017 included", rooth_path = rooth_path, name_fig="\\sellin_date_units.png")
From supplier to final user (sellout)
groupby_date_out = df_sellout_final.groupby(pd.Grouper(key='Date', freq='M')).sum("Units").sort_values("Date", ascending=True)
groupby_date_out
| Sku | Units | Price | Store_Id | |
|---|---|---|---|---|
| Date | ||||
| 2016-01-31 | 2586627104720 | 1.153033e+07 | 9.270365e+06 | 1675614957 |
| 2016-02-29 | 2528140100774 | 1.175701e+07 | 9.161127e+06 | 1635886854 |
| 2016-03-31 | 2707407321623 | 1.188439e+07 | 9.848261e+06 | 1751512781 |
| 2016-04-30 | 2548253029135 | 1.142255e+07 | 9.205185e+06 | 1648321856 |
| 2016-05-31 | 0 | 0.000000e+00 | 0.000000e+00 | 0 |
| 2016-06-30 | 0 | 0.000000e+00 | 0.000000e+00 | 0 |
| 2016-07-31 | 2696057269920 | 1.226700e+07 | 9.865532e+06 | 1742448065 |
| 2016-08-31 | 2635214976726 | 1.223977e+07 | 9.618069e+06 | 1698137710 |
| 2016-09-30 | 0 | 0.000000e+00 | 0.000000e+00 | 0 |
| 2016-10-31 | 296779199880 | 1.838804e+06 | 1.086698e+06 | 191070075 |
| 2016-11-30 | 2802799439513 | 1.230504e+07 | 1.055507e+07 | 1804642150 |
| 2016-12-31 | 2793971104014 | 1.252113e+07 | 1.051159e+07 | 1798159164 |
| 2017-01-31 | 557587095272 | 3.629245e+06 | 3.089747e+06 | 29929443 |
| 2017-02-28 | 523060711750 | 3.222012e+06 | 2.961250e+06 | 27985376 |
| 2017-03-31 | 556307474190 | 3.353358e+06 | 3.372731e+06 | 29836223 |
| 2017-04-30 | 551564945145 | 3.643755e+06 | 3.430129e+06 | 29602526 |
| 2017-05-31 | 573532181030 | 3.819300e+06 | 3.563815e+06 | 30881680 |
| 2017-06-30 | 75683054443 | 6.539072e+05 | 4.746754e+05 | 4066927 |
x = groupby_date_out.index
y = groupby_date_out.Units
linediagram_figure(x = x, y = y, x_label = "State which sells more", y_label = "Units from 2016 to 2017 included", rooth_path = rooth_path, name_fig="\\sellout_date_units.png")
groupby_supplier_in = df_sellin_final.groupby("Supplier").sum("Units")
groupby_supplier_in
| Sku | Units | Price | Margin | Store_Id | |
|---|---|---|---|---|---|
| Supplier | |||||
| proveedor_1 | 384675814838 | 1.786670e+07 | 2.026823e+06 | 1.485068e+06 | 20509695 |
| proveedor_2 | 6999680323250 | 1.457777e+08 | 3.402684e+07 | 2.449329e+07 | 4554611598 |
Supplier_1 = groupby_supplier_in["Units"].values[0]
Supplier_2 = groupby_supplier_in["Units"].values[1]
labels = ["Supplier 2", "Supplier 1"]
piechart_figure_6labels(x1 = Supplier_2, x2 = Supplier_1, labels = labels, rooth_path = rooth_path, name_fig="\\sellin_supplier_units.png")
# Here I saved the final DataFrames cleaned to send to Data Scientists
# Then they will have to charge in a notebook as a variable to make query
df_sellin_final.to_csv("..\data\sellin_final.csv")
df_sellout_final.to_csv("..\data\sellout_final.csv")
# To make the query they will have to fill in the variables as indicated in the example
start_date = "2016-1-1"
end_date = "2016-1-31"
df = df_sellin_final # Fill it with df_sellin_final or df_sellout_final
Products = ["Crunchy Bran Mango Peach", "Chocos Mango Peach", "Lucjy Charms Chocolate", "Stars Mango Peach", "Muesli Chocolate"]
Store_id = [1668, 240, 606, 612]
df[(df['Date'] > start_date) & (df['Date'] < end_date) & (df['Sku_Description'].isin(Products)) & (df['Store_Id'].isin(Store_id))]
| Date | Sku | Sku_Description | Sku_Category | Sku_Subcategory | Units | Price | Margin | Store_Id | Supplier | Format | Town_or_City | State | Division | Distributor | Structure | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-02 | 4240039 | Lucjy Charms Chocolate | Sugar-free | Cereal bars | 48.0 | 13.74 | 9.46 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 828 | 2016-01-02 | 4381425 | Chocos Mango Peach | Sugar-free | Cereal bars | 600.0 | 15.18 | 10.84 | 240 | proveedor_2 | TinyShop | Rare Lemon Key | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| 1556 | 2016-01-02 | 4442378 | Crunchy Bran Mango Peach | Sugar-free | Cereal bars | 36.0 | 11.09 | 5.51 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 1577 | 2016-01-02 | 4381425 | Chocos Mango Peach | Sugar-free | Cereal bars | 48.0 | 13.84 | 9.49 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 2370 | 2016-01-02 | 4554143 | Stars Mango Peach | Sugar-free | Cereal bars | 36.0 | 13.84 | 9.49 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113664 | 2016-01-29 | 4381425 | Chocos Mango Peach | Sugar-free | Cereal bars | 48.0 | 13.84 | 9.49 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 113739 | 2016-01-29 | 4250046 | Muesli Chocolate | Sugar-free | Cereal bars | 12.0 | 13.74 | 9.46 | 1668 | proveedor_2 | Grocery | Enlightened Dark Empire | Magornmar | METRO-SUR | Ioalfio Ifonforge Industries | METRO |
| 114376 | 2016-01-29 | 4381425 | Chocos Mango Peach | Sugar-free | Cereal bars | 600.0 | 15.18 | 10.84 | 240 | proveedor_2 | TinyShop | Rare Lemon Key | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| 116361 | 2016-01-29 | 4240039 | Lucjy Charms Chocolate | Sugar-free | Cereal bars | 600.0 | 14.51 | 10.22 | 240 | proveedor_2 | TinyShop | Rare Lemon Key | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
| 116601 | 2016-01-29 | 4250046 | Muesli Chocolate | Sugar-free | Cereal bars | 408.0 | 14.51 | 10.22 | 240 | proveedor_2 | TinyShop | Rare Lemon Key | Alloralf | METRO-NORTE | Lorthogus Toughworks | METRO |
78 rows × 16 columns